# Always print this out before your assignment
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] ggridges_0.5.3 glmnetUtils_1.1.8
[3] glmnet_4.1-3 Matrix_1.3-4
[5] plotROC_2.2.1 scales_1.1.1
[7] tidyquant_1.0.3 quantmod_0.4.18
[9] TTR_0.24.2 PerformanceAnalytics_2.0.4
[11] xts_0.12.1 zoo_1.8-9
[13] plotly_4.10.0 viridis_0.6.2
[15] viridisLite_0.4.0 pastecs_1.3.21
[17] kableExtra_1.3.4 lubridate_1.8.0
[19] rsample_0.1.1 ggthemes_4.2.4
[21] ggrepel_0.9.1 here_1.0.1
[23] fs_1.5.0 forcats_0.5.1
[25] stringr_1.4.0 dplyr_1.0.7
[27] purrr_0.3.4 readr_2.1.0
[29] tidyr_1.1.4 tibble_3.1.6
[31] ggplot2_3.3.5 tidyverse_1.3.1
[33] knitr_1.36
loaded via a namespace (and not attached):
[1] colorspace_2.0-2 ellipsis_0.3.2 rprojroot_2.0.2
[4] rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3
[7] farver_2.1.0 fansi_0.5.0 xml2_1.3.2
[10] codetools_0.2-18 splines_4.1.2 jsonlite_1.7.2
[13] broom_0.7.10 dbplyr_2.1.1 compiler_4.1.2
[16] httr_1.4.2 backports_1.4.0 assertthat_0.2.1
[19] fastmap_1.1.0 lazyeval_0.2.2 cli_3.1.0
[22] htmltools_0.5.2 tools_4.1.2 gtable_0.3.0
[25] glue_1.5.0 Rcpp_1.0.7 cellranger_1.1.0
[28] jquerylib_0.1.4 vctrs_0.3.8 svglite_2.0.0
[31] nlme_3.1-153 iterators_1.0.13 crosstalk_1.2.0
[34] xfun_0.28 globals_0.14.0 rvest_1.0.2
[37] lifecycle_1.0.1 future_1.23.0 hms_1.1.1
[40] parallel_4.1.2 yaml_2.2.1 curl_4.3.2
[43] gridExtra_2.3 sass_0.4.0 stringi_1.7.5
[46] highr_0.9 foreach_1.5.1 boot_1.3-28
[49] shape_1.4.6 rlang_0.4.12 pkgconfig_2.0.3
[52] systemfonts_1.0.3 evaluate_0.14 lattice_0.20-45
[55] htmlwidgets_1.5.4 labeling_0.4.2 tidyselect_1.1.1
[58] parallelly_1.29.0 plyr_1.8.6 magrittr_2.0.1
[61] R6_2.5.1 generics_0.1.1 DBI_1.1.1
[64] pillar_1.6.4 haven_2.4.3 withr_2.4.2
[67] mgcv_1.8-38 survival_3.2-13 modelr_0.1.8
[70] crayon_1.4.2 Quandl_2.11.0 utf8_1.2.2
[73] tzdb_0.2.0 rmarkdown_2.11 grid_4.1.2
[76] readxl_1.3.1 data.table_1.14.2 reprex_2.0.1
[79] digest_0.6.28 webshot_0.5.2 munsell_0.5.0
[82] bslib_0.3.1 quadprog_1.5-8
getwd()
[1] "/Users/ryanradcliff/Documents/BUS696/BROCODE_Final_Project"
# load all your libraries in this chunk
library('tidyverse')
library("fs")
library('here')
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
library('ggthemes')
library('kableExtra')
library('pastecs')
library('viridis')
library('plotly')
library('tidyquant')
library('scales')
library("gdata")
gdata: read.xls support for 'XLS' (Excel 97-2004) files
gdata: ENABLED.
gdata: read.xls support for 'XLSX' (Excel 2007+) files
gdata: ENABLED.
Attaching package: ‘gdata’
The following objects are masked from ‘package:xts’:
first, last
The following objects are masked from ‘package:pastecs’:
first, last
The following objects are masked from ‘package:dplyr’:
combine, first, last
The following object is masked from ‘package:purrr’:
keep
The following object is masked from ‘package:stats’:
nobs
The following object is masked from ‘package:utils’:
object.size
The following object is masked from ‘package:base’:
startsWith
# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk.
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children and number of degrees
dataclean <- data %>%
mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
mutate(Birthdate = mdy(Birthdate)) %>%
mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(ID = as.numeric(ID)) %>%
mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>%
mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
ifelse(is.na(Child.2.ID),1,2)))%>%
mutate(ID = as.numeric(ID)) %>%
mutate( nmb_degree = ifelse(is.na(Degree.Type.1),0,
ifelse(is.na(Degree.Type.2),1,2))) %>%
#conferral dates
mutate(Conferral.Date.1 = ifelse(Conferral.Date.1 == "0001-01-01", NA, Conferral.Date.1)) %>%
mutate(Conferral.Date.1 = mdy(Conferral.Date.1)) %>%
mutate(Conferral.Date.1.Age = as.numeric(floor(interval(start= Conferral.Date.1, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Conferral.Date.2 = ifelse(Conferral.Date.2 == "0001-01-01", NA, Conferral.Date.2)) %>%
mutate(Conferral.Date.2 = mdy(Conferral.Date.2)) %>%
mutate(Conferral.Date.2.Age = as.numeric(floor(interval(start= Conferral.Date.2, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(Last.Contact.By.Anyone = ifelse(Last.Contact.By.Anyone == "0001-01-01", NA, Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.By.Anyone = mdy(Last.Contact.By.Anyone)) %>%
mutate(Last.Contact.Age = as.numeric(floor(interval(start= Last.Contact.By.Anyone, end=Sys.Date())/duration(n=1, unit="years")))) %>%
mutate(HH.First.Gift.Date = ifelse(HH.First.Gift.Date == "0001-01-01", NA, HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Date = mdy(HH.First.Gift.Date)) %>%
mutate(HH.First.Gift.Age = as.numeric(floor(interval(start= HH.First.Gift.Date, end=Sys.Date())/duration(n=1, unit="years")))) %>%
#major gift
mutate(major_gifter = ifelse(Lifetime.Giving > 50000, 1,0) %>% factor(., levels = c("0","1"))) %>%
#splitting up the age into ranges and creating category for easy visualization
mutate(age_range =
ifelse(Age %in% 10:19, "10 < 20 years old",
ifelse(Age %in% 20:29, "20 < 30 years old",
ifelse(Age %in% 30:39, "30 < 40 years old",
ifelse(Age %in% 40:49, "40 < 50 years old",
ifelse(Age %in% 50:59, "50 < 60 years old",
ifelse(Age %in% 60:69, "60 < 70 years old",
ifelse(Age %in% 70:79, "70 < 80 years old",
ifelse(Age %in% 80:89, "80 < 90 years old",
ifelse(Age %in% 90:120, "90+ years old",
NA)))))))))) %>%
#creating a region column using the county data and the OMB MSA (Metropolitan Statistical Area) definitions
mutate(region =
ifelse(County == "San Luis Obispo" & State == "CA", "So Cal",
ifelse(County == "Kern" & State == "CA", "So Cal",
ifelse(County == "San Bernardino" & State == "CA", "So Cal",
ifelse(County == "Santa Barbara" & State == "CA", "So Cal",
ifelse(County == "Ventura" & State == "CA", "So Cal",
ifelse(County == "Los Angeles" & State == "CA", "So Cal",
ifelse(County == "Orange" & State == "CA", "So Cal",
ifelse(County == "Riverside" & State == "CA", "So Cal",
ifelse(County == "San Diego" & State == "CA", "So Cal",
ifelse(County == "Imperial" & State == "CA", "So Cal",
ifelse(County == "King" & State == "WA", "Seattle",
ifelse(County == "Snohomish" & State == "WA", "Seattle",
ifelse(County == "Pierce" & State == "WA", "Seattle",
ifelse(County == "Clackamas" & State == "OR", "Portland",
ifelse(County == "Columbia" & State == "OR", "Portland",
ifelse(County == "Multnomah" & State == "OR", "Portland",
ifelse(County == "Washington" & State == "OR", "Portland",
ifelse(County == "Yamhill" & State == "OR", "Portland",
ifelse(County == "Clark" & State == "WA", "Portland",
ifelse(County == "Skamania" & State == "WA", "Portland",
ifelse(County == "Denver" & State == "CO", "Denver",
ifelse(County == "Arapahoe" & State == "CO", "Denver",
ifelse(County == "Jefferson" & State == "CO", "Denver",
ifelse(County == "Adams" & State == "CO", "Denver",
ifelse(County == "Douglas" & State == "CO", "Denver",
ifelse(County == "Broomfield" & State == "CO", "Denver",
ifelse(County == "Elbert" & State == "CO", "Denver",
ifelse(County == "Park" & State == "CO", "Denver",
ifelse(County == "Clear Creek" & State == "CO", "Denver",
ifelse(County == "Alameda" & State == "CA", "Bay Area",
ifelse(County == "Contra Costa" & State == "CA", "Bay Area",
ifelse(County == "Marin" & State == "CA", "Bay Area",
ifelse(County == "Monterey" & State == "CA", "Bay Area",
ifelse(County == "Napa" & State == "CA", "Bay Area",
ifelse(County == "San Benito" & State == "CA", "Bay Area",
ifelse(County == "San Francisco" & State == "CA", "Bay Area",
ifelse(County == "San Mateo" & State == "CA", "Bay Area",
ifelse(County == "Santa Clara" & State == "CA", "Bay Area",
ifelse(County == "Santa Cruz" & State == "CA", "Bay Area",
ifelse(County == "Solano" & State == "CA", "Bay Area",
ifelse(County == "Sonoma" & State == "CA", "Bay Area",
NA)))))))))))))))))))))))))))))))))))))))))) %>%
mutate(region =
ifelse(County == "Kings" & State == "NY", "New York",
ifelse(County == "Queens" & State == "NY", "New York",
ifelse(County == "New York" & State == "NY", "New York",
ifelse(County == "Bronx" & State == "NY", "New York",
ifelse(County == "Richmond" & State == "NY", "New York",
ifelse(County == "Westchester" & State == "NY", "New York",
ifelse(County == "Bergen" & State == "NY", "New York",
ifelse(County == "Hudson" & State == "NY", "New York",
ifelse(County == "Passaic" & State == "NY", "New York",
ifelse(County == "Putnam" & State == "NY", "New York",
ifelse(County == "Rockland" & State == "NY", "New York",
ifelse(County == "Suffolk" & State == "NY", "New York",
ifelse(County == "Nassau" & State == "NY", "New York",
ifelse(County == "Middlesex" & State == "NJ", "New York",
ifelse(County == "Monmouth" & State == "NJ", "New York",
ifelse(County == "Ocean" & State == "NJ", "New York",
ifelse(County == "Somerset" & State == "NJ", "New York",
ifelse(County == "Essex" & State == "NJ", "New York",
ifelse(County == "Union" & State == "NJ", "New York",
ifelse(County == "Morris" & State == "NJ", "New York",
ifelse(County == "Sussex" & State == "NJ", "New York",
ifelse(County == "Hunterdon" & State == "NJ", "New York",
ifelse(County == "Pike" & State == "NJ", "New York",
region)))))))))))))))))))))))) %>%
# code nor cal region as all others in CA not already defined
mutate(region =
ifelse(State == "CA" & is.na(region) == TRUE, "Nor Cal", region))
#Initial Removal of Columns that provide no benefit
dataclean <- subset(dataclean,select = -c(Assignment.Number
,Assignment.has.Historical.Mngr
,Suffix
,Assignment.Date
,Assignment.Manager
,Assignment.Role
,Assignment.Title
,Assignment.Status
,Strategy
,Progress.Level
,Assignment.Group
,Assignment.Category
,Funding.Method
,Expected.Book.Date
,Qualification.Amount
,Expected.Book.Amount
,Expected.Book.Date
,Hard.Gift.Total
,Soft.Credit.Total
,Total.Assignment.Gifts
,No.of.Pledges
,Proposal..
,Proposal.Notes
,HH.Life.Spouse.Credit
,Last.Contact.By.Manager
,X..of.Contacts.By.Manager
,DonorSearch.Range
,iWave.Range
,WealthEngine.Range
,Philanthropic.Commitments
))
#cleaning up zip codes removing -4 after
dataclean$Zip <- gsub(dataclean$Zip, pattern="-.*", replacement = "")
#adding zip code data and column
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding zip salary column
dataclean <-dataclean %>%
mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E)) %>%
#slry range
mutate(zipslry_range =
ifelse(zipcode_slry %in% 10000:89999, "90K-99K",
ifelse(zipcode_slry %in% 90000:99999, "90K-99K",
ifelse(zipcode_slry %in% 100000:149999, "100K-149K",
ifelse(zipcode_slry %in% 150000:199999, "150K-199K",
ifelse(zipcode_slry %in% 200000:249999, "200K-249K",
ifelse(zipcode_slry %in% 250000:299999, "250K-299K",
ifelse(zipcode_slry %in% 300000:349999, "300K-349K",
ifelse(zipcode_slry %in% 350000:399999, "350K-399K",
ifelse(zipcode_slry %in% 400000:499999, "400K-499K",
ifelse(zipcode_slry %in% 500000:999999, "500K-999K",
NA)))))))))))
sum(is.na(dataclean$zipcode_slry))
[1] 62347
#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding scholarship column
dataclean <-dataclean %>%
mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP))
#replacing NA with 0
dataclean$scholarship <- replace_na(dataclean$scholarship,'0')
#replacing Y with 1
dataclean$scholarship<-ifelse(dataclean$scholarship=="Y",1,0)
#checking how many are N
table(dataclean$scholarship)
0 1
295264 27962
#checking and deleting scholarship column
class(dataclean$schlr_fct)
[1] "NULL"
dataclean = subset(dataclean, select = -c(scholarship))
#checking for duplicates N >1 indicates a records values are in the file twice
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
#removing duplicated records
dataclean <- unique(dataclean)
#Verifying n = 1 no ID with multiple records cleaned of dupes
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
NA
1d Creating many many factor variables
dataclean <-
dataclean %>%
#SEX
mutate(sex_fct =
fct_explicit_na(Sex),
sex_simple =
fct_lump_n(Sex, n = 4),
#MARRIED
married_fct =
fct_explicit_na(Married),
#DONOR SEGMENT
donorseg_fct =
fct_explicit_na(Donor.Segment),
donorseg_simple =
fct_lump_n(Donor.Segment, n = 4),
#CONTACT RULE
contact_fct =
fct_explicit_na(Contact.Rules),
contact_simple =
fct_lump_n(Contact.Rules, n = 4),
#SPOUSE MAIL
spomail_fct =
fct_explicit_na(Spouse.Mail.Rules),
spomail_simple =
fct_lump_n(Spouse.Mail.Rules, n = 4),
#JOB TITLE
jobtitle_fct =
fct_explicit_na(Job.Title),
jobtitle_simple =
fct_lump_n(Job.Title, n = 5),
#DEGREE TYPE 1
deg1_fct =
fct_explicit_na(Degree.Type.1),
deg1_simple =
fct_lump_n(Degree.Type.1, n = 5),
#DEGREE TYPE 2
deg2_fct =
fct_explicit_na(Degree.Type.2),
deg2_simple =
fct_lump_n(Degree.Type.2, n = 5),
#MAJOR 1
maj1_fct =
fct_explicit_na(Major.1),
maj1_simple =
fct_lump_n(Major.1, n = 5),
#MAJOR 2
maj2_fct =
fct_explicit_na(Major.2),
maj2_simple =
fct_lump_n(Major.2, n = 5),
#MINOR 1
min1_fct =
fct_explicit_na(Minor.1),
min1_simple =
fct_lump_n(Minor.1, n = 5),
#MINOR 2
min2_fct =
fct_explicit_na(Minor.2),
min2_simple =
fct_lump_n(Minor.2, n = 5),
#SCHOOL 1
school1_fct =
fct_explicit_na(School.1),
school1_simple =
fct_lump_n(School.1, n = 5),
#SCHOOL 2
school2_fct =
fct_explicit_na(School.2),
school2_simple =
fct_lump_n(School.2, n = 5),
#INSTITUTION TYPE
insttype_fct =
fct_explicit_na(Institution.Type),
insttype_simple =
fct_lump_n(Institution.Type, n = 5),
#EXTRACURRICULAR
extra_fct =
fct_explicit_na(Extracurricular),
extra_simple =
fct_lump_n(Extracurricular, n = 5),
#HH FIRST GIFT FUND
hhfirstgift_fct =
fct_explicit_na(HH.First.Gift.Fund),
hhfirstgift_simple =
fct_lump_n(HH.First.Gift.Fund, n = 5),
#CHILD 1 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.1.Enroll.Status),
ch1_enroll_simple =
fct_lump_n(Child.1.Enroll.Status, n = 4),
#CHILD 1 MAJOR
ch1_maj_fct =
fct_explicit_na(Child.1.Major),
ch1_maj_simple =
fct_lump_n(Child.1.Major, n = 4),
#CHILD 1 MINOR
ch1_min_fct =
fct_explicit_na(Child.1.Minor),
ch1_min_simple =
fct_lump_n(Child.1.Minor, n = 4),
#CHILD 1 SCHOOL
ch1_school_fct =
fct_explicit_na(Child.1.School),
ch1_school_simple =
fct_lump_n(Child.1.School, n = 4),
#CHILD 1 FEEDER
ch1_feeder_fct =
fct_explicit_na(Child.1.Feeder.School),
ch1_feeder_simple =
fct_lump_n(Child.1.Feeder.School, n = 4),
#CHILD 2 ENROLL STATUS
ch1_enroll_fct =
fct_explicit_na(Child.2.Enroll.Status),
ch2_enroll_simple =
fct_lump_n(Child.2.Enroll.Status, n = 4),
#CHILD 2 MAJOR
ch2_maj_fct =
fct_explicit_na(Child.2.Major),
ch2_maj_simple =
fct_lump_n(Child.2.Major, n = 4),
#CHILD 2 MINOR
ch2_min_fct =
fct_explicit_na(Child.2.Minor),
ch2_min_simple =
fct_lump_n(Child.2.Minor, n = 4),
#CHILD 2 SCHOOL
ch2_school_fct =
fct_explicit_na(Child.2.School),
ch2_school_simple =
fct_lump_n(Child.2.School, n = 4),
#CHILD 2 FEEDER
ch2_feeder_fct =
fct_explicit_na(Child.2.Feeder.School),
ch2_feeder_simple =
fct_lump_n(Child.2.Feeder.School, n = 4),
)
#checking to see if its a factor
#class(dataclean$sex_fct)
#class(dataclean$donorseg_fct)
#class(dataclean$contact_fct)
#class(dataclean$spomail_fct)
#checking levels
#levels(dataclean$sex_simple)
#levels(dataclean$donorseg_simple)
#levels(dataclean$contact_simple)
#levels(dataclean$spomail_simple)
#levels(dataclean$hhfirstgift_simple)
#creating a table against Sex column
#table(dataclean$sex_fct, dataclean$sex_simple)
Region Analysis
#grouping by region and analyzing
dataclean %>%
group_by(region) %>%
summarise(Count = length(region),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
mutate(mean_total_giv = dollar(mean_total_giv)) %>%
kable(col.names = c("Region", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Region | Count | Mean HH Lifetime Giving |
|---|---|---|
| So Cal | 145139 | $5,090.84 |
| NA | 130306 | $2,040.98 |
| Bay Area | 20641 | $755.92 |
| Nor Cal | 10707 | $3,823.63 |
| Seattle | 5425 | $922.08 |
| New York | 4959 | $1,978.49 |
| Portland | 2976 | $1,098.24 |
| Denver | 2847 | $257.29 |
NA
NA
DonorSegment Analysis
#grouping by donorsegment and analyzing
dataclean %>%
group_by(Donor.Segment) %>%
summarise(Count = length(Donor.Segment),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
#added scales package to have the values show in dollar
mutate(mean_total_giv = (dollar(mean_total_giv))) %>%
kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Donor Segment | Count | Mean HH Lifetime Giving |
|---|---|---|
| NA | 231974 | $0.00 |
| Lost Donor | 69718 | $4,954.47 |
| Lapsed Donor | 11193 | $10,069.79 |
| Current Donor | 5603 | $90,638.32 |
| Lapsing Donor | 3862 | $16,590.15 |
| At-Risk Donor | 650 | $77,143.93 |
NA
NA
First gift size
aq <- quantile(dataclean$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
aq <- as.data.frame(aq)
aq$aq <- dollar(aq$aq)
aq %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Quantile | |
|---|---|
| 25% | $0.00 |
| 50% | $0.00 |
| 75% | $0.00 |
| 90% | $40.00 |
| 99% | $1,910.06 |
NA
NA
Consecutive giving
#consecutive years of giving
dataclean %>%
filter(Max.Consec.Fiscal.Years > 0) %>%
ggplot(aes(Max.Consec.Fiscal.Years)) + geom_histogram(fill = "#002845", bins = 20) +
theme_economist_white() +
ggtitle("Consecutive Years of Giving Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,2)) +
scale_y_continuous(breaks = seq(0,10000000,5000))
NA
NA
NA
Lifetime giving based on number of children
dataclean %>%
filter(HH.Lifetime.Giving <= 10000) %>%
filter(HH.Lifetime.Giving > 0) %>%
mutate(`No_of_Children` = as.factor(`No_of_Children`)) %>%
ggplot(aes(HH.Lifetime.Giving, fill = `No_of_Children`)) + geom_histogram(bins = 30) + theme_economist_white() +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,100000,1000)) +
scale_y_continuous(breaks = seq(0,100000000,5000)) +
ggtitle("Giving distribution and number of children")+
scale_fill_manual(values=c("#002845", "#00cfcc", "#ff9973"))
NA
NA
NA
Mean, Median, and Count of Giving in Age Ranges
age_range_giving <- dataclean %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
glimpse(age_range_giving)
Rows: 10
Columns: 4
$ age_range <chr> "10 < 20 years old", "20 < 30 …
$ avg_giving <dbl> 0.4464244, 28.2686916, 386.687…
$ med_giving <dbl> 0, 0, 0, 0, 0, 0, 0, 10, 15, 0
$ amount_of_people_in_age_range <int> 3977, 24556, 21022, 16834, 207…
2a) Plotting average giving by age range
age_range_giving <-
age_range_giving %>%
mutate(age_range = factor(age_range))
ggplot(age_range_giving, aes(age_range, avg_giving)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_text(angle=45,
hjust=1)) + labs(x = "Age Range", y = "Average Giving") +
ggtitle("Average Giving Compared Across Age Ranges")
NA
NA
2b) Count of donors based on age range (another way to look at it)
ggplot(dataclean,
aes(age_range)) +
geom_bar() +
theme(axis.text.x = element_text(angle=45,
hjust=1)) +
labs(title = "Count of Age Ranges", x = "", y = "")
NA
NA
2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
ggplot(dataclean, aes(age_range,HH.Lifetime.Giving,fill = age_range)) +
geom_boxplot(
outlier.colour = "red") +
scale_y_log10() +
theme(axis.text.x=element_text(angle=45,hjust=1)) + labs(x = "Age Range", y = "Lifetime Giving Amount") +
ggtitle("Lifetime Giving Compared Across Age Ranges")
NA
NA
2d) Splitting by age and gender
#creating boxplots
dataclean %>%
filter(Age < 100) %>% #removing the weird outliers that are over 100
filter(Sex %in% c("M", "F")) %>%
ggplot(aes(Sex, Age)) +
geom_boxplot() +
theme_economist() +
ggtitle("Ages of Donors Based on Gender") +
xlab(NULL) + ylab(NULL)
Giving by gender
#remove NAs U X
dataclean2 <- dataclean %>%
filter(Sex %in% c("M", "F"))
q <- ggplot(dataclean2)
q + stat_summary_bin(
aes(y = HH.Lifetime.Giving, x = Sex),
fun.y = "mean", geom = "bar")
summary(dataclean$sex_simple)
Mean age by gender
#breakdown of sexs
tally(group_by(dataclean, Sex))
summarize(group_by(dataclean, Sex),
avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
avg_age = mean(Age, na.rm = TRUE),
med_age = median(Age, na.rm = TRUE))
#grouping by sex and age range for slides
tally(group_by(dataclean, Sex, age_range))
2e) Distribution of people in the states that they live.
dataclean %>%
mutate(State = ifelse(State == " ", "NA", State)) %>%
filter(State != "NA") %>%
group_by(State) %>%
summarise(Count = length(State)) %>%
filter(Count > 800) %>%
arrange(-Count) %>%
kable(col.names = c("Donor's State", "Count")) %>%
kable_styling(bootstrap_options = c("condensed"),
full_width = F)
| Donor's State | Count |
|---|---|
| CA | 176487 |
| WA | 7957 |
| TX | 7266 |
| NY | 5659 |
| CO | 5073 |
| AZ | 4925 |
| OR | 4612 |
| FL | 4111 |
| IL | 3681 |
| HI | 3394 |
| PA | 2904 |
| OH | 2754 |
| NV | 2715 |
| MI | 2523 |
| MA | 2473 |
| NJ | 2311 |
| VA | 2158 |
| NC | 2085 |
| GA | 2044 |
| MO | 1889 |
| MN | 1732 |
| MD | 1488 |
| TN | 1443 |
| IN | 1417 |
| CT | 1380 |
| WI | 1330 |
| UT | 1173 |
| OK | 1151 |
| AL | 1120 |
| LA | 1110 |
| ID | 1096 |
| SC | 1076 |
| KY | 1032 |
| KS | 1027 |
| NM | 981 |
| IA | 880 |
NA
NA
2f) Looking at all donors first gift amount. 75% made a first gift of <100.
no_non_donors <- dataclean %>%
filter(Lifetime.Giving != 0)
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
nd <- as.data.frame(nd)
nd %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
Split data and create a new set for easier analysis
#converting married Y and N to 1 and 0
dataclean <- dataclean %>%
mutate(Married_simple = ifelse(Married == "N",0,1))
dataclean <- dataclean %>%
mutate(hh.lifetime.giving_fct = as.factor(HH.Lifetime.Giving)) %>%
mutate(HH.Lifetime.Giving.Plus = log(HH.Lifetime.Giving + 1))
#Creating the velvet set - only the best can enter
datavelvet <- subset(dataclean,select = -c(
age_range,
Athletics,
Birthdate,
Category.Codes,
Category.Descriptions,
Child.1.Enroll.Status,
Child.1.ID,
Child.2.ID,
Child.1.Major,
Child.1.Minor,
Child.1.School,
Child.1.Feeder.School,
Child.2.Enroll.Status,
Child.2.Major,
Child.2.Minor,
Child.2.School,
Child.2.Feeder.School,
ch1_enroll_fct,
ch1_maj_fct,
ch1_min_fct,
ch1_school_fct,
ch1_feeder_fct,
ch2_school_fct,
ch2_feeder_fct,
ch2_maj_fct,
ch2_min_fct,
Contact.Rules,
Degree.Type.1,
Degree.Type.2,
Donor.Segment,
City,
County,
Conferral.Date.1,
Conferral.Date.2,
Contact.Rules,
contact_fct,
Degree.Type.1,
Degree.Type.2,
deg1_fct,
deg2_fct,
donorseg_fct,
Extracurricular,
extra_fct,
HH.First.Gift.Fund,
HH.First.Gift.Date,
hhfirstgift_fct,
hh.lifetime.giving_fct,
ID,
Institution.Type,
insttype_fct,
Job.Title,
jobtitle_fct,
Last.Contact.By.Anyone,
LegacyLeader..compass.score.,
major_gifter,
Major.1,
Major.2,
Minor.1,
Minor.2,
maj1_fct,
maj2_fct,
min1_fct,
min2_fct,
region,
zipcode_slry,
Sex,
Scholarship,
School.1,
School.2,
school1_fct,
school2_fct,
Spouse.Birthdate,
Spouse.Mail.Rules,
spomail_fct,
State,
Zip,
zipslry_range
))
#datavelvet <-
#datavelvet[sapply(datavelvet, is.character)] <- #lapply(datavelvet[sapply(datavelvet, is.character)],
# as.factor)
library("rsample")
data_split <- initial_split(datavelvet, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
p <- datavelvet %>%
ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
Scale for 'x' is already present. Adding another scale for 'x',
which will replace the existing scale.
ggplotly(p)
p
ggplot(data = datavelvet, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
NA
NA
NA
Another Histogram
datavelvet %>%
filter(Age >= 10) %>%
filter(Age <= 90) %>%
ggplot(aes(Age)) + geom_histogram(fill = "#002845", bins = 20) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,10000000,2000))
Age distribution by gender
#Age Gender filtered out below 15 and above 90 - also removed U X the weird values
datavelvet %>%
filter(Age >= 15) %>%
filter(Age <= 90) %>%
mutate(Sex = as.factor(Sex)) %>%
filter(Sex != "U") %>%
filter(Sex != "X") %>%
ggplot(aes(Age, fill = Sex)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Age Distribution by Gender") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,10)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Donor age distribution by marital status
#Age Marital Status
datavelvet %>%
filter(Age >= 20) %>%
filter(Age <= 85) %>%
ggplot(aes(Age, fill = Married)) + geom_histogram(bins = 25) + theme_economist_white() +
ggtitle("Overall Donor Age Distribution by Marital Status") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(0,120,5)) +
scale_y_continuous(breaks = seq(0,50000,2000)) + scale_fill_manual(values=c("#ff9973", "#00cfcc"))
Linear Model
#These will focus on predicting whether a constituent is a donor or non-donor.
mod1lm <- lm( Lifetime.Giving ~ Married_simple,
data = data_train)
mod2lm <- lm( Total.Giving.Years ~ Lifetime.Giving,
data = data_train)
mod3lm <- lm( Lifetime.Giving ~ region,
data = data_train)
summary(mod1lm)
Call:
lm(formula = Lifetime.Giving ~ Married_simple, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-2867 -2742 -2661 -2661 18111464
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2660.9 251.3 10.588 <0.0000000000000002 ***
Married_simple 205.9 469.1 0.439 0.661
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 104400 on 242248 degrees of freedom
Multiple R-squared: 7.953e-07, Adjusted R-squared: -3.333e-06
F-statistic: 0.1927 on 1 and 242248 DF, p-value: 0.6607
summary(mod2lm)
Call:
lm(formula = Total.Giving.Years ~ Lifetime.Giving, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-36.600 -0.554 -0.554 -0.554 39.403
Coefficients:
Estimate Std. Error t value
(Intercept) 0.55445026328 0.00396511550 139.83
Lifetime.Giving 0.00000343205 0.00000003795 90.43
Pr(>|t|)
(Intercept) <0.0000000000000002 ***
Lifetime.Giving <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.951 on 242248 degrees of freedom
Multiple R-squared: 0.03266, Adjusted R-squared: 0.03265
F-statistic: 8178 on 1 and 242248 DF, p-value: < 0.00000000000000022
summary(mod3lm)
Call:
lm(formula = Lifetime.Giving ~ region, data = data_train)
Residuals:
Min 1Q Median 3Q Max
-3977 -3968 -3968 -3598 18110156
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 513.0 950.9 0.539 0.589558
regionDenver -367.7 2739.3 -0.134 0.893220
regionNew York 1954.2 2160.7 0.904 0.365769
regionNor Cal 3464.0 1623.1 2.134 0.032826 *
regionPortland 161.0 2680.2 0.060 0.952111
regionSeattle -128.2 2088.2 -0.061 0.951057
regionSo Cal 3455.5 1016.1 3.401 0.000672 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 118200 on 144684 degrees of freedom
(97559 observations deleted due to missingness)
Multiple R-squared: 0.0001214, Adjusted R-squared: 7.989e-05
F-statistic: 2.927 on 6 and 144684 DF, p-value: 0.007435
#increasing the giving year one year increase total giving by 0.0035
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~region) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Region")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~nmb_degree) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Number of Degrees")
`geom_smooth()` using formula 'y ~ x'
ggplot(data = data_train, aes(x = Age, y = log(HH.First.Gift.Amount))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~donorseg_fct) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("Donor Segment")
`geom_smooth()` using formula 'y ~ x'
#This plot actually has some interesting results
ggplot(data = data_train, aes(x = Age, y = log(Lifetime.Giving))) + geom_point(alpha = 1/10) + geom_smooth(method = lm) + facet_wrap(~No_of_Children) + theme_clean(base_size = 8) + labs(x = "X", y = "Y") +
ggtitle("# Children")
`geom_smooth()` using formula 'y ~ x'
data_train %>%
select_if(is.factor) %>%
glimpse()
Rows: 242,250
Columns: 54
$ major_gifter <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ sex_fct <fct> F, M, F, M, (Missing), M, M, (Missing…
$ sex_simple <fct> F, M, F, M, NA, M, M, NA, M, NA, M, M…
$ married_fct <fct> Y, Y, N, N, N, N, N, N, N, N, N, N, N…
$ donorseg_fct <fct> Lost Donor, (Missing), (Missing), (Mi…
$ donorseg_simple <fct> Lost Donor, NA, NA, NA, NA, Lost Dono…
$ contact_fct <fct> No Solicitations, (Missing), (Missing…
$ contact_simple <fct> No Solicitations, NA, NA, NA, NA, No …
$ spomail_fct <fct> No Solicitations, (Missing), (Missing…
$ spomail_simple <fct> No Solicitations, NA, NA, NA, NA, NA,…
$ jobtitle_fct <fct> (Missing), Manager, (Missing), Public…
$ jobtitle_simple <fct> NA, Other, NA, Other, NA, NA, NA, NA,…
$ deg1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ deg1_simple <fct> NA, NA, NA, NA, NA, Bachelor of Arts,…
$ deg2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ deg2_simple <fct> NA, NA, NA, NA, NA, Master of Arts, N…
$ maj1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ maj1_simple <fct> NA, NA, NA, NA, NA, Other, Law (Full-…
$ maj2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ maj2_simple <fct> NA, NA, NA, NA, NA, Other, NA, NA, NA…
$ min1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ min1_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ min2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ min2_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ school1_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ school1_simple <fct> NA, NA, NA, NA, NA, NA, Other, NA, NA…
$ school2_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ school2_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ insttype_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ insttype_simple <fct> NA, NA, NA, NA, NA, NA, Law JD Full-T…
$ extra_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ extra_simple <fct> NA, NA, NA, NA, NA, Other, NA, NA, NA…
$ hhfirstgift_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ hhfirstgift_simple <fct> NA, NA, NA, NA, NA, Pre-SRN Conversio…
$ ch1_enroll_fct <fct> (Missing), Program Completed, (Missin…
$ ch1_enroll_simple <fct> NA, NA, NA, NA, NA, Program Completed…
$ ch1_maj_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_maj_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch1_min_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_min_simple <fct> NA, NA, NA, NA, NA, Non-Degree: GR Ta…
$ ch1_school_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch1_school_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch1_feeder_fct <fct> (Missing), Palm Beach State College, …
$ ch1_feeder_simple <fct> NA, Other, NA, NA, NA, NA, NA, NA, NA…
$ ch2_enroll_simple <fct> NA, Program Completed, NA, NA, NA, NA…
$ ch2_maj_fct <fct> (Missing), Business Administration BS…
$ ch2_maj_simple <fct> NA, Business Administration BS, NA, N…
$ ch2_min_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch2_min_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ ch2_school_fct <fct> (Missing), George L. Argyros School o…
$ ch2_school_simple <fct> NA, George L. Argyros School of Busin…
$ ch2_feeder_fct <fct> (Missing), (Missing), (Missing), (Mis…
$ ch2_feeder_simple <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ hh.lifetime.giving_fct <fct> 25, 0, 0, 0, 0, 8048.75, 0, 0, 0, 0, …
MORE MODELS
Big logistic model
# Set family to binomial to set logistic function
# Run the model on the training set
donor_logit1 <-
glm(hh.lifetime.giving_fct ~ Married_simple,
family = "binomial",
data = data_train)
summary(donor_logit1)
donor_logit2 <-
glm(hh.lifetime.giving_fct ~ No_of_Children,
family = "binomial",
data = data_train)
summary(donor_logit2)
#summary(data_train$major_gifter)
#Assignment_flag taken out - may add back
donor_logit3 <-
glm(major_gifter ~ Married + No_of_Children + donorseg_simple + Total.Giving.Years + nmb_degree,
family = "binomial",
data = data_train)
summary(donor_logit3)
Call:
glm(formula = major_gifter ~ Married + No_of_Children + donorseg_simple +
Total.Giving.Years + nmb_degree, family = "binomial", data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0162 -0.1346 -0.1189 -0.0647 4.2151
Coefficients:
Estimate Std. Error z value
(Intercept) -3.623014 0.232406 -15.589
MarriedY -1.196678 0.088444 -13.530
No_of_Children 0.610558 0.060093 10.160
donorseg_simpleCurrent Donor -0.086079 0.238167 -0.361
donorseg_simpleLapsed Donor -0.685781 0.244285 -2.807
donorseg_simpleLapsing Donor -0.491440 0.259772 -1.892
donorseg_simpleLost Donor -1.349829 0.232121 -5.815
Total.Giving.Years 0.206177 0.005601 36.809
nmb_degree -2.493578 0.146982 -16.965
Pr(>|z|)
(Intercept) < 0.0000000000000002 ***
MarriedY < 0.0000000000000002 ***
No_of_Children < 0.0000000000000002 ***
donorseg_simpleCurrent Donor 0.7178
donorseg_simpleLapsed Donor 0.0050 **
donorseg_simpleLapsing Donor 0.0585 .
donorseg_simpleLost Donor 0.00000000606 ***
Total.Giving.Years < 0.0000000000000002 ***
nmb_degree < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9606.7 on 68301 degrees of freedom
Residual deviance: 7052.9 on 68293 degrees of freedom
(173948 observations deleted due to missingness)
AIC: 7070.9
Number of Fisher Scoring iterations: 8
exp(donor_logit3$coefficients)
(Intercept) MarriedY
0.02670207 0.30219634
No_of_Children donorseg_simpleCurrent Donor
1.84145937 0.91752216
donorseg_simpleLapsed Donor donorseg_simpleLapsing Donor
0.50369656 0.61174479
donorseg_simpleLost Donor Total.Giving.Years
0.25928458 1.22897028
nmb_degree
0.08261383
#training predictions for in sample preds
preds_train <- predict(donor_logit3, newdata = data_train, type = "response")
#test predicts for OOS (out of sample)
preds_test <- predict(donor_logit3, newdata = data_test, type = "response")
head(preds_train)
236259 260845 287321 219737 27673
0.015426706 0.009114444 NA NA NA
298107
NA
head(preds_test)
4 10 11 14 15
0.132366655 0.002564702 0.888420638 0.136046442 0.742195040
18
0.002087869
results_train <- data.frame(
`truth` = data_train %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_train,
`type` = rep("train",length(preds_train))
)
results_test <- data.frame(
`truth` = data_test %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_test,
`type` = rep("test",length(preds_test))
)
results <- bind_rows(results_train,results_test)
dim(results_train)
[1] 242250 3
dim(results_test)
[1] 80750 3
dim(results)
[1] 323000 3
library('plotROC')
p_plot <-
ggplot(results,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 2.5,
#Took the labelsize down to avoid cutoff
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
#We removed some of the cutoffs to avoid the mashup near the origin.
#Changed the theme to avoid cutoff plot values.
theme_classic(base_size = 14) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
print(p_plot)
p_train <-
ggplot(results_train,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
p_test <-
ggplot(results_test,
aes(m = Class1, d = major_gifter, color = type)) +
geom_roc(labelsize = 3.5,
cutoffs.at = c(0.7,0.5,0.3,0.1,0)) +
theme_minimal(base_size = 16) +
labs(x = "False Positive Rate",
y = "True Positive Rate") +
ggtitle("ROC Plot: Training and Test")
#summary(donor_logit3)
#coef(donor_logit3)
#Calculating AUC of both
print(calc_auc(p_train)$AUC)
[1] 0.8918463
print(calc_auc(p_test)$AUC)
[1] 0.8839279
RIDGE
library('glmnet')
library('glmnetUtils')
ridge_fit1 <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + donorseg_fct + No_of_Children,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit1)
Call:
cv.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ sex_fct +
donorseg_fct + No_of_Children, data = data_train, alpha = 0)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Number of crossvalidation folds: 10
Alpha: 0
Deviance-minimizing lambda: 0.2204993 (+1 SE): 0.2914871
print(ridge_fit1$lambda.min)
[1] 0.2204993
print(ridge_fit1$lambda.1se)
[1] 0.2914871
plot(ridge_fit1)
BIG RIDGE
library('glmnet')
library('glmnetUtils')
data_train %>% map(levels) %>% map(length)
$HH.Total.Gifts.FY20.21
[1] 0
$HH.Total.Gifts.FY19.20
[1] 0
$HH.Total.Gifts.FY18.19
[1] 0
$HH.Total.Gifts.FY17.18
[1] 0
$HH.Total.Gifts.FY16.17
[1] 0
$Class.Year
[1] 0
$Spouse.Class.Year
[1] 0
$Child.1.Class.Year
[1] 0
$Child.2.Class.Year
[1] 0
$Lifetime.Giving
[1] 0
$HH.Lifetime.Giving
[1] 0
$Total.Giving.Years
[1] 0
$Total.Giving.Fiscal.Years
[1] 0
$Max.Consec.Fiscal.Years
[1] 0
$HH.Life.Hard.Credit
[1] 0
$HH.Life.Soft.Credit
[1] 0
$HH.First.Gift.Amount
[1] 0
$Months.Since.Last.Gift
[1] 0
$Compass.Score
[1] 0
$Age
[1] 0
$Spouse.Age
[1] 0
$Assignment_flag
[1] 0
$No_of_Children
[1] 0
$nmb_degree
[1] 0
$Conferral.Date.1.Age
[1] 0
$Conferral.Date.2.Age
[1] 0
$Last.Contact.Age
[1] 0
$HH.First.Gift.Age
[1] 0
$sex_simple
[1] 4
$married_fct
[1] 2
$donorseg_simple
[1] 5
$contact_simple
[1] 5
$spomail_simple
[1] 5
$jobtitle_simple
[1] 6
$deg1_simple
[1] 6
$deg2_simple
[1] 6
$maj1_simple
[1] 6
$maj2_simple
[1] 6
$min1_simple
[1] 6
$min2_simple
[1] 7
$school1_simple
[1] 6
$school2_simple
[1] 6
$insttype_simple
[1] 6
$extra_simple
[1] 6
$hhfirstgift_simple
[1] 6
$ch1_enroll_simple
[1] 5
$ch1_maj_simple
[1] 5
$ch1_min_simple
[1] 5
$ch1_school_simple
[1] 5
$ch1_feeder_simple
[1] 5
$ch2_enroll_simple
[1] 5
$ch2_maj_simple
[1] 5
$ch2_min_simple
[1] 5
$ch2_school_simple
[1] 5
$ch2_feeder_simple
[1] 5
$Married_simple
[1] 0
$HH.Lifetime.Giving.Plus
[1] 0
ridge_fit2 <- cv.glmnet(HH.Lifetime.Giving.Plus ~ Age + school1_simple + insttype_simple + extra_simple + ch1_maj_simple + ch2_maj_simple + Married_simple + donorseg_simple + married_fct + sex_simple + nmb_degree,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit2)
Call:
cv.glmnet.formula(formula = HH.Lifetime.Giving.Plus ~ Age + school1_simple +
insttype_simple + extra_simple + ch1_maj_simple + ch2_maj_simple +
Married_simple + donorseg_simple + married_fct + sex_simple +
nmb_degree, data = data_train, alpha = 0)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Number of crossvalidation folds: 10
Alpha: 0
Deviance-minimizing lambda: 11.86158 (+1 SE): 1186.158
print(ridge_fit2$lambda.min)
[1] 11.86158
print(ridge_fit2$lambda.1se)
[1] 1186.158
plot(ridge_fit2)
LASSO
#Using cv.glmnet from class
#ls(data_train)
#is.factor(data_train$major_gifter)
#glimpse(data_train$Lifetime.Giving)
#data_train %>%
# select_if(is.factor) %>%
# glimpse()
library(glmnet)
library(glmnetUtils)
lasso_fit <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + jobtitle_simple + nmb_degree + school1_simple + hhfirstgift_simple + maj1_simple + donorseg_simple + No_of_Children + Married,
data = data_train,
#Alpha 1 for lasso
alpha = 1)
print(lasso_fit$lambda.min)
[1] 0.01891898
#
print(lasso_fit$lambda.1se)
[1] 0.1009649
plot(lasso_fit)
LASSO 2
library(glmnet)
library(glmnetUtils)
lasso_fit <- cv.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + Age + school1_simple + insttype_simple + extra_simple + Married + donorseg_simple + nmb_degree+ No_of_Children,
data = data_train,
#Alpha 1 for lasso
alpha = 1)
print(lasso_fit$lambda.min)
[1] 0.004309598
#
print(lasso_fit$lambda.1se)
[1] 0.06399617
plot(lasso_fit)
coef(lasso_fit)
43 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 6.822196
Age .
school1_simpleCollege of Health and Behavioral Sciences .
school1_simpleDonna Ford Attallah College of Educational Studies .
school1_simpleGeorge L. Argyros School of Business and Economics .
school1_simpleLawrence and Kristina Dodge Coll of Film & Media .
school1_simpleWilkinson Coll of Arts Humanities & Soc Sciences .
school1_simpleOther .
insttype_simpleGraduate Degree .
insttype_simpleLaw JD Full-Time Program .
insttype_simpleUndergraduate Degree .
insttype_simpleUndergraduate Degree | Undergraduate Degree .
insttype_simpleUndergraduate Degree | Undergraduate Degree | Undergraduate Degree .
insttype_simpleOther .
extra_simpleChapman Choir Tour .
extra_simpleDisciples on Campus .
extra_simpleFootball .
extra_simpleInternational Student .
extra_simpleWorld Campus Afloat/Sem at Sea .
extra_simpleOther .
ch1_maj_simpleBusiness Administration BS .
ch1_maj_simpleCommunication Studies BA .
ch1_maj_simpleFilm Production BFA .
ch1_maj_simplePsychology BA .
ch1_maj_simpleOther .
ch2_maj_simpleBusiness Administration BS .
ch2_maj_simpleCommunication Studies BA .
ch2_maj_simpleFilm Production BFA .
ch2_maj_simplePsychology BA .
ch2_maj_simpleOther .
Married_simple .
donorseg_simpleAt-Risk Donor .
donorseg_simpleCurrent Donor .
donorseg_simpleLapsed Donor .
donorseg_simpleLapsing Donor .
donorseg_simpleLost Donor .
married_fctN .
married_fctY .
sex_simpleF .
sex_simpleM .
sex_simpleU .
sex_simpleX .
nmb_degree .
#Default setting is lambda.1se
#From the book - showing convergence with lambda values
plot(lasso_fit$glmnet.fit, xvar="lambda")
#abline(v=log(c(lasso_fit$lambda.min, lasso_fit$lambda.1se)), lty=2)
enet_mod <- cva.glmnet(HH.Lifetime.Giving.Plus ~ sex_fct + jobtitle_simple + nmb_degree + school1_simple + hhfirstgift_simple + maj1_simple + donorseg_simple + No_of_Children + Married,
data = data_train,
alpha = seq(0,1, by = 0.1))
print(enet_mod)
plot(enet_mod)
ELASTICNET
minlossplot(enet_mod,
cv.type = "min")
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
get_model_params <- function(fit) {
alpha <- fit$alpha
lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
best <- which.min(error)
data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
lambdaSE = lambdaSE[best], eror = error[best])
}
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
get_model_params(enet_mod)
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]
print(best_mod)
minlossplot(enet_mod, cv.type = "min")
Ridges plot - could be useful for plotting donations vs donor segment
library('ggridges')
summary(data_train$variable)
ggplot(data_train, aes(x = HH.Lifetime.Giving, y = region)) + geom_density_ridges(rel_min_height = 0.005) + xlim(c(25000, 100000)) +
ggtitle("HH Lifetime Giving by Region")
library('corrplot')
#removing ID zip and nonnumeric
corrplot_data <- dataclean[-c(1:48,52:56,58:60,63,66:67,70:72,74:81,83:132)]
#Convert from character to numeric data type
convert_fac2num <- function(x){
as.numeric(as.factor(x))
}
corrplot_data <- mutate_at(corrplot_data,
.vars = c(1:12),
.funs = convert_fac2num)
#making a matrix
cd_cor <- cor(corrplot_data)
#creating correlation
col <- colorRampPalette(c("#BB4400", "#EE9990",
"#FFFFFF", "#77AAEE", "#4477BB"))
corrplot(cd_cor, method="color", col=col(100),
type="lower", addCoef.col = "black",
tl.pos="lt", tl.col="black",
tl.cex=0.7, tl.srt=45,
number.cex=0.7,
diag=FALSE)
#correlation matrix
# pairs(~Age + Months.Since.Last.Gift + donorseg_fct +
# nmb_degree + No_of_Children + HH.First.Gift.Age + HH.First.Gift.Amount + Total.Giving.Years,
# col = corrplot_data$HH.Lifetime.Giving,
# data = corrplot_data,
# main = "Donor Scatter Plot Matrix")
#worthless..
ggplot(data = corrplot_data, aes(x = nmb_degree, y = HH.Lifetime.Giving)) +
geom_point(aplha = 1/10)+
geom_smooth(method = "lm", color ="red")
Random Forest
library('randomForest')
rf_fit_donor <- randomForest(Lifetime.Giving ~ .,
data = data_train,
type = classification,
mtry = 7,
na.action = na.roughfix,
ntree = 200,
importance=TRUE
)
print(rf_fit_donor)
varImpPlot(rf_fit_donor, sort = TRUE,
n.var = 5,
type = 2, class = NULL, scale = TRUE,
main = deparse(substitute(rf_fit_donor)))
library('randomForestExplainer')
plot_min_depth_distribution(
rf_fit_donor,
k = 10,
min_no_of_trees = 0,
mean_sample = "top_trees",
mean_scale = FALSE,
mean_round = 2,
main = "Distribution of minimal depth and its mean"
)
#Splitting Category out to check if the category is useful for analysis
data_category_split_out <- dataclean %>%
mutate(Category.Codes = trim(strsplit(as.character(Category.Codes), "|", fixed = TRUE))) %>%
unnest(Category.Codes) %>% pivot_wider(names_from = Category.Codes,values_from =Category.Codes, values_fn = length)